%% Data preparation: load, clean, normalize, and split
clear; clc; close all;

% Load the table of predictors + targets
load ml_data

% Drop the grade column so it is not used as a predictor
% NOTE(review): comment said "SPGrade" but the column name is 'SPGGrade' -- confirm spelling
ml_data = removevars(ml_data, 'SPGGrade');

% Keep the regression target, then drop it from the predictor table
ml_data_output = ml_data.SPGScore;
ml_data = removevars(ml_data, 'SPGScore');

% Standardize every predictor column (z-score normalization)
ml_data{:,:} = zscore(ml_data{:,:});

% Hold out 25% of the observations for testing
cvp = cvpartition(ml_data_output, 'HoldOut', 0.25);
nc_train_input  = ml_data(training(cvp), :);
nc_train_output = ml_data_output(training(cvp), :);
nc_test_input   = ml_data(test(cvp), :);
nc_test_output  = ml_data_output(test(cvp), :);

% Fix the random seed for reproducibility
rng(1);
%% Fitrtree
% Workflow for this section:
%   Predict
%   Calculate error
%   Calculate R-square
%   Plot Error Histogram
%   Plot Scatter Plot
% Regression tree with Bayesian hyperparameter optimization over
% MinLeafSize and MaxNumSplits (no trailing semicolon: model is displayed).
Mdl_rtree = fitrtree(nc_train_input, nc_train_output, ...
    'OptimizeHyperparameters', {'MinLeafSize', 'MaxNumSplits'}, ...
    'HyperparameterOptimizationOptions', ...
    struct('AcquisitionFunctionName', 'expected-improvement-plus'))

% Predictions on both splits
outputs_rtree_train = predict(Mdl_rtree, nc_train_input);
outputs_rtree_test  = predict(Mdl_rtree, nc_test_input);

%--------------------------------------------------------------------------
% Mean squared error on each split
mse_train = mean((outputs_rtree_train - nc_train_output).^2);
mse_test  = mean((outputs_rtree_test  - nc_test_output).^2);

%--------------------------------------------------------------------------
% Linear correlation between predictions and targets (corrcoef returns a
% 2x2 matrix; the off-diagonal entry is the correlation coefficient)
R_train = corrcoef(outputs_rtree_train, nc_train_output);
R_test  = corrcoef(outputs_rtree_test,  nc_test_output);
r_train = R_train(1, 2);
r_test  = R_test(1, 2);

% Diagnostic plots: error histogram and prediction-vs-target scatter
plotErrorHistogram(nc_train_output, outputs_rtree_train, nc_test_output, outputs_rtree_test)
plotScatterDiagram(nc_train_output, outputs_rtree_train, nc_test_output, outputs_rtree_test, r_train, r_test)
%% SVM
% fitrlinear efficiently trains linear regression models with
% high-dimensional, full or sparse predictor data. Available linear
% regression models include regularized support vector machines (SVM) and
% least-squares regression methods. fitrlinear minimizes the objective
% function using techniques that reduce computing time (e.g., stochastic
% gradient descent).
% A high-dimensional data set includes many predictor variables. Although
% such a data set can consume a significant fraction of memory, it must
% fit in the MATLAB® Workspace. For low- through medium-dimensional
% predictor data sets, see "Alternatives for Lower-Dimensional Data".
% Linear SVM / least-squares regression (fitrlinear) with 'auto'
% hyperparameter optimization. fitrlinear needs a numeric matrix, so the
% predictor tables are converted with {:,:}.
hyperopts = struct('AcquisitionFunctionName','expected-improvement-plus');
nc_train_input_matrix = nc_train_input{:,:};
[Mdl_svm,FitInfo,HyperparameterOptimizationResults] = fitrlinear(nc_train_input_matrix,nc_train_output,...
'OptimizeHyperparameters','auto',...
'HyperparameterOptimizationOptions',hyperopts)
% predict on both splits
outputs_svm_train=predict(Mdl_svm,nc_train_input_matrix);
nc_test_input_matrix = nc_test_input{:,:};
outputs_svm_test=predict(Mdl_svm,nc_test_input_matrix);
% plot error histogram
plotErrorHistogram(nc_train_output, outputs_svm_train, nc_test_output, outputs_svm_test)
%--------------------------------------------------------------------------
% calculate the mean square error (MSE) of the test points
mse_train=sum((outputs_svm_train - nc_train_output).^2)/length(nc_train_output);
mse_test=sum((outputs_svm_test - nc_test_output).^2)/length(nc_test_output);
%--------------------------------------------------------------------------
% calculate the correlation coefficients for the training and test data
% sets with the associated linear fits
R_train = corrcoef(outputs_svm_train,nc_train_output);
R_test = corrcoef(outputs_svm_test,nc_test_output);
r_train=R_train(1,2);
r_test=R_test(1,2);
% plot scatter plot
% BUG FIX: this previously passed the regression-tree outputs
% (outputs_rtree_*) instead of the SVM outputs, so the SVM scatter plot
% showed the wrong model's predictions.
plotScatterDiagram(nc_train_output, outputs_svm_train, nc_test_output, outputs_svm_test, r_train, r_test)
%% TreeBagger Regression
% Bagged regression ensemble (random forest) of 100 trees with surrogate
% splits and curvature-test predictor selection; out-of-bag predictor
% importance enabled (no trailing semicolon: model is displayed).
Mdl_TB = TreeBagger(100, nc_train_input, nc_train_output, ...
    'Method', 'Regression', ...
    'Surrogate', 'on', ...
    'PredictorSelection', 'curvature', ...
    'OOBPredictorImportance', 'on')

% Predictions on both splits
outputs_tb_train = predict(Mdl_TB, nc_train_input);
outputs_tb_test  = predict(Mdl_TB, nc_test_input);

% Error histogram
plotErrorHistogram(nc_train_output, outputs_tb_train, nc_test_output, outputs_tb_test)

%--------------------------------------------------------------------------
% Mean squared error on each split
mse_train = mean((outputs_tb_train - nc_train_output).^2);
mse_test  = mean((outputs_tb_test  - nc_test_output).^2);

%--------------------------------------------------------------------------
% Correlation between predictions and targets
R_train = corrcoef(outputs_tb_train, nc_train_output);
R_test  = corrcoef(outputs_tb_test,  nc_test_output);
r_train = R_train(1, 2);
r_test  = R_test(1, 2);

% Scatter plot of predictions vs. targets
plotScatterDiagram(nc_train_output, outputs_tb_train, nc_test_output, outputs_tb_test, r_train, r_test)
%--------------------------------------------------------------------------
% Estimate predictor importance from the OOB permutation deltas, then plot
% the top-n (at most 31) predictors as a horizontal bar chart.
imp=Mdl_TB.OOBPermutedPredictorDeltaError;
[sorted_imp,isorted_imp] = sort(imp,'descend');
n = sum(imp>0);   % keep only predictors with positive importance
if n > 31
n = 31;           % cap the chart at 31 bars
end
%--------------------------------------------------------------------------
% Horizontal bar chart, most important on top (YDir reversed, log x-axis);
% the top-5 bars are recolored yellow and the top-3 red by over-plotting.
figure;barh(imp(isorted_imp(1:n)));hold on;grid on;
% BUG FIX: hard-coded 1:5 / 1:3 errors out when fewer than 5 (or 3)
% predictors have positive importance; clamp with min().
barh(imp(isorted_imp(1:min(5,n))),'y');barh(imp(isorted_imp(1:min(3,n))),'r');
title('Predictor Importance Estimates');
xlabel('Estimates with Curvature Tests');ylabel('Predictors');
set(gca,'FontSize',20); set(gca,'TickDir','out'); set(gca,'LineWidth',2);
ax = gca;ax.YDir='reverse';ax.XScale = 'log';
sorted_predictor_names = Mdl_TB.PredictorNames(isorted_imp(1:n));
% Label each bar with its (underscore-stripped) predictor name
for i=1:length(sorted_predictor_names)
text(...
1.05*imp(isorted_imp(i)),i,...
strrep(sorted_predictor_names{i},'_',''),...
'FontSize',12 ...
)
end
print('-dpng','NC-full-input-importance.png');% save to an png file
% List the selected predictor names in order of importance.
% BUG FIX: disp(col) printed each name wrapped in a 1x1 cell; unwrap it.
for col=sorted_predictor_names
disp(col{1})
end
%% TreeBagger Occam's Razor (Reduced Model)
% Occam's-razor model: same TreeBagger family restricted to the n most
% important predictors identified above.
nc_train_input_simpler = nc_train_input(:, isorted_imp(1:n));
nc_test_input_simpler  = nc_test_input(:, isorted_imp(1:n));

% Reset the seed so this fit is reproducible
rng(1);
Mdl_TB_simpler = TreeBagger(100, nc_train_input_simpler, nc_train_output, ...
    'Method', 'Regression', ...
    'Surrogate', 'on', ...
    'PredictorSelection', 'curvature', ...
    'OOBPredictorImportance', 'on')

% Predictions on both splits
outputs_tb_train_simpler = predict(Mdl_TB_simpler, nc_train_input_simpler);
outputs_tb_test_simpler  = predict(Mdl_TB_simpler, nc_test_input_simpler);

% Error histogram
plotErrorHistogram(nc_train_output, outputs_tb_train_simpler, nc_test_output, outputs_tb_test_simpler)

%--------------------------------------------------------------------------
% Mean squared error on each split
mse_train = mean((outputs_tb_train_simpler - nc_train_output).^2);
mse_test  = mean((outputs_tb_test_simpler  - nc_test_output).^2);

%--------------------------------------------------------------------------
% Correlation between predictions and targets
R_train = corrcoef(outputs_tb_train_simpler, nc_train_output);
R_test  = corrcoef(outputs_tb_test_simpler,  nc_test_output);
r_train = R_train(1, 2);
r_test  = R_test(1, 2);

% Scatter plot of predictions vs. targets
plotScatterDiagram(nc_train_output, outputs_tb_train_simpler, nc_test_output, outputs_tb_test_simpler, r_train, r_test)
%% Ensemble Regression Models (Boosting)
%--------------------------------------------------------------------------
% Ensemble regression (fitrensemble) with full hyperparameter optimization,
% run in parallel with the optimization plots shown.
Mdl_en = fitrensemble(nc_train_input, nc_train_output, ...
    'OptimizeHyperparameters', 'all', ...
    'HyperparameterOptimizationOptions', struct('UseParallel', true, 'ShowPlots', true))

% Predictions on both splits
outputs_en_train = predict(Mdl_en, nc_train_input);
outputs_en_test  = predict(Mdl_en, nc_test_input);

% Error histogram
plotErrorHistogram(nc_train_output, outputs_en_train, nc_test_output, outputs_en_test)

%--------------------------------------------------------------------------
% Mean squared error on each split
mse_train = mean((outputs_en_train - nc_train_output).^2);
mse_test  = mean((outputs_en_test  - nc_test_output).^2);

%--------------------------------------------------------------------------
% Correlation between predictions and targets
R_train = corrcoef(outputs_en_train, nc_train_output);
R_test  = corrcoef(outputs_en_test,  nc_test_output);
r_train = R_train(1, 2);
r_test  = R_test(1, 2);

% Scatter plot of predictions vs. targets
plotScatterDiagram(nc_train_output, outputs_en_train, nc_test_output, outputs_en_test, r_train, r_test)
%--------------------------------------------------------------------------
% Estimate predictor importance for the ensemble model and plot the top-n
% (at most 31) predictors as a horizontal bar chart.
imp=predictorImportance(Mdl_en)
[sorted_imp,isorted_imp] = sort(imp,'descend');
n = sum(imp>0);   % keep only predictors with positive importance
if n > 31
n = 31;           % cap the chart at 31 bars
end
%--------------------------------------------------------------------------
% Horizontal bar chart, most important on top (YDir reversed, log x-axis);
% the top-5 bars are recolored yellow and the top-3 red by over-plotting.
figure;barh(imp(isorted_imp(1:n)));hold on;grid on;
% BUG FIX: hard-coded 1:5 / 1:3 errors out when fewer than 5 (or 3)
% predictors have positive importance; clamp with min().
barh(imp(isorted_imp(1:min(5,n))),'y');barh(imp(isorted_imp(1:min(3,n))),'r');
title('Predictor Importance Estimates');
% BUG FIX: the xlabel was copy-pasted from the TreeBagger section;
% predictorImportance on a fitrensemble model is not a curvature-test
% estimate.
xlabel('Importance estimates (fitrensemble)');ylabel('Predictors');
set(gca,'FontSize',20); set(gca,'TickDir','out'); set(gca,'LineWidth',2);
ax = gca;ax.YDir='reverse';ax.XScale = 'log';
sorted_predictor_names = Mdl_en.PredictorNames(isorted_imp(1:n));
% Label each bar with its (underscore-stripped) predictor name
for i=1:length(sorted_predictor_names)
text(...
1.05*imp(isorted_imp(i)),i,...
strrep(sorted_predictor_names{i},'_',''),...
'FontSize',12 ...
)
end
% BUG FIX: this previously printed to 'NC-full-input-importance.png',
% silently overwriting the TreeBagger importance chart saved earlier.
print('-dpng','NC-ensemble-input-importance.png');% save to an png file
% List the selected predictor names in order of importance.
% BUG FIX: disp(col) printed each name wrapped in a 1x1 cell; unwrap it.
for col=sorted_predictor_names
disp(col{1})
end
%% Ensemble Model Regression with simpler model
% Reduced ensemble model using only the most important predictors found
% by the full ensemble above.
nc_train_input_simpler = nc_train_input(:, isorted_imp(1:n));
nc_test_input_simpler  = nc_test_input(:, isorted_imp(1:n));

% Reset the seed so this fit is reproducible
rng(1);
%--------------------------------------------------------------------------
% Ensemble model with hyperparameter optimization on the reduced inputs
Mdl_en_simpler = fitrensemble(nc_train_input_simpler, nc_train_output, ...
    'OptimizeHyperparameters', 'all', ...
    'HyperparameterOptimizationOptions', struct('UseParallel', true, 'ShowPlots', true))

% Predictions on both splits
outputs_en_train_simpler = predict(Mdl_en_simpler, nc_train_input_simpler);
outputs_en_test_simpler  = predict(Mdl_en_simpler, nc_test_input_simpler);

% Error histogram
plotErrorHistogram(nc_train_output, outputs_en_train_simpler, nc_test_output, outputs_en_test_simpler)

%--------------------------------------------------------------------------
% Mean squared error on each split
mse_train = mean((outputs_en_train_simpler - nc_train_output).^2);
mse_test  = mean((outputs_en_test_simpler  - nc_test_output).^2);

%--------------------------------------------------------------------------
% Correlation between predictions and targets
R_train = corrcoef(outputs_en_train_simpler, nc_train_output);
R_test  = corrcoef(outputs_en_test_simpler,  nc_test_output);
r_train = R_train(1, 2);
r_test  = R_test(1, 2);

% Scatter plot of predictions vs. targets
plotScatterDiagram(nc_train_output, outputs_en_train_simpler, nc_test_output, outputs_en_test_simpler, r_train, r_test)
%% Neural Network Regression using Ensemble Boosting algorithm's best predictors.
% save the variables
% save AllRegressionLearners
% ml_data_simple_en = ml_data(:, isorted_imp(1:n));
% save Ensemble_Regression_Simple.mat ml_data_simple_en
% Prepare data for sending it into a neural network
% Commands to create Ensemble Neural Network data with simple model
clear; close all; clc;
load Ensemble_Regression_Simple.mat
% BUG/robustness: per the commented save command above, the .mat file was
% saved with only ml_data_simple_en, so after the clear the next line would
% fail on an undefined ml_data_output. Fail fast with a clear message;
% ml_data_output must be saved into the same .mat file.
if ~exist('ml_data_output', 'var')
    error('Ensemble_Regression_Simple.mat must also contain ml_data_output.');
end
target = ml_data_output;
% The neural-network scripts expect observations in columns, so transpose
% both the predictor matrix and the target vector before saving.
X_EN = ml_data_simple_en{:,:};
X_EN = X_EN';
Y_EN = target;
Y_EN = Y_EN';
save nn_reg_en.mat X_EN Y_EN;
run('en_reg_nn.m')
run('en_reg_nn_best.m')
%% Neural Network Regression using TreeBagger algorithm's best predictors.
%ml_data_simple_tb = ml_data(:, isorted_imp(1:n));
%save TreeBagger_Regression_Simple.mat ml_data_simple_tb
% Commands to create TreeBagger Neural Network data with simple model
clear; close all; clc;
load TreeBagger_Regression_Simple.mat;
% BUG/robustness: per the commented save command above, the .mat file was
% saved with only ml_data_simple_tb, so after the clear the next line would
% fail on an undefined ml_data_output. Fail fast with a clear message;
% ml_data_output must be saved into the same .mat file.
if ~exist('ml_data_output', 'var')
    error('TreeBagger_Regression_Simple.mat must also contain ml_data_output.');
end
target = ml_data_output;
% The neural-network scripts expect observations in columns, so transpose
% both the predictor matrix and the target vector before saving.
X_TB = ml_data_simple_tb{:,:};
X_TB = X_TB';
Y_TB = target;
Y_TB = Y_TB';
save nn_reg_tb.mat X_TB Y_TB;
run('tb_reg_nn.m')
run('tb_reg_nn_best.m')